# use this line for installing/loading# pacman::p_load()# - packages to load stored in a variable (vector)pkgs <-c("tidyverse","glue","scales","lubridate","patchwork","ggh4x","ggrepel","openintro","ggridges","dsbox","janitor","here","knitr","ggthemes","ggplot2","kableExtra","palmerpenguins","grid","htmltools","plotly","ggforce","cowplot","magick","forcats","stringr","viridis")# - load from the character array/vectorpacman::p_load(char=pkgs)# - install tidyverse/dsbox directly from Git Hub# - this allows for the possible need to install on a repo. pull.# - and, if it's already installed just thorw an alert.if (!requireNamespace("dsbox", quietly =TRUE)) {message("Installing 'dsbox' from GitHub (not found locally)...")suppressMessages(devtools::install_github("tidyverse/dsbox"))} else {message("[FYI]\n'dsbox' already installed — skipping GitHub install.")}
# - alert to user packages loaded.# Set number of columns (adjustable)n_cols <-4# Add * to each package namepkgs <-paste0("* ", pkgs)# Calculate number of rows based on total packagesn_rows <-ceiling(length(pkgs) / n_cols)# Pad with empty strings to complete gridpkgs_padded <-c(pkgs, rep("", n_rows * n_cols -length(pkgs)))# Create matrix (fill by row)pkg_matrix <-matrix(pkgs_padded, nrow = n_rows, byrow =TRUE)# Print headercat("The packages loaded:")
The packages loaded:
Code
# Loop and print each row (use invisible to suppress NULL)invisible(apply(pkg_matrix, 1, function(row) {cat(paste(format(row, width =22), collapse =""), "\n")}))
#-------------------------->####################### Basic set Theme up ######################## ---- set theme for ggplot2ggplot2::theme_set(ggplot2::theme_minimal(base_size =14))# set width of code outputoptions(width =65)# set figure parameters for knitrknitr::opts_chunk$set(fig.width =7, # 7" widthfig.asp =0.618, # the golden ratiofig.retina =3, # dpi multiplier for displaying HTML output on retinafig.align ="center", # center align figuresdpi =300# higher dpi, sharper image)## ---- end theme set up
(>>>>) - function block
In an effort to reduce repeating code a function block was created.
Code
# ............ A function block, to handle Q3,Q4 with minimal code duplication# - size as a variableset_dot_size <-1# Function for the "All" group plot (g0)plot_all <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(response =as_labeller(response_labels),explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =11) +labs(title ="COVID-19 Vaccine Attitudes by Demographic Group",x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsplot.title =element_text(hjust =0.5),strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_text(vjust =0.5,size = strip_text_size,margin =margin(t =20, b =10, r =10, l =10) ),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =28, l =28) ),axis.text.y =element_blank(),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Age plot (g1)plot_age <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =25, l =25) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Gender plot (g2)plot_gender <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =17, l =17) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),panel.spacing =unit(1, "lines"),axis.ticks.x =element_blank() )}# Function for the Race plot (g3)plot_race <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =22, l =22) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Ethnicity plot (g4)plot_ethnicity <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =18, l =18) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# Function for the Profession plot (g5)plot_profession <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =14, l =14) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# Function for the Vax(COVID) plot (g6)plot_vax_status <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =12, l =12) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# (g7) - flu Vxplot_flu_vax_status <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =paste0("Mean Likert score\n(Error bars: ", sub_title_specific, ")") ) +theme(panel.spacing.x =unit(0.1, "lines"),strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =4, l =7) ),axis.text.y =element_text(size =10),axis.text.x =element_text(size =10), # <- show x-axis Likert labelsaxis.ticks.x =element_line(), # <- show x-axis tickspanel.spacing =unit(1, "lines"))}# ..... prepare the variables.# . ethnicity.filter_ethnicity_data <-function(data) { data %>%filter(explanatory =="exp_ethnicity") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="Hispanic/Latino","2"="Non-Hispanic/Non-Latino"),explanatory_value =factor(explanatory_value, levels =c("Hispanic/Latino", "Non-Hispanic/Non-Latino" )),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity" )) )}# . agefilter_age_data <-function(data) { data %>%filter(explanatory =="exp_age_bin") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="<20","20"="21-25","25"="26-30","30"=">30" ),explanatory_value =factor(explanatory_value, levels =c("<20", "21-25", "26-30", ">30")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . genderfilter_gender_data <-function(data) { data %>%filter(explanatory =="exp_gender") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =as.character(explanatory_value),explanatory_value =fct_recode(factor(explanatory_value),"Prefer not to say"="4","Non-binary third gender"="3","Male"="0","Female"="1" ),explanatory_value =factor(explanatory_value, levels =rev(c("Prefer not to say","Non-binary third gender","Male","Female" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . racefilter_race_data <-function(data) { data %>%filter(explanatory =="exp_race") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="American Indian/Alaska Native","2"="Asian","3"="Black/African American","4"="Native Hawaiian/Other Pacific Islander","5"="White" ),explanatory_value =factor(explanatory_value, levels =rev(c("White","Native Hawaiian/Other Pacific Islander","Black/African American","Asian","American Indian/Alaska Native" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . medical profssionfilter_profession_data <-function(data) { data %>%filter(explanatory =="exp_profession") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="Medical","1"="Nursing" ),explanatory_value =factor(explanatory_value, levels =c("Nursing", "Medical")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession")) )}# . had covid vaxfilter_vax_status_data <-function(data) { data %>%filter(explanatory =="exp_already_vax") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="No","1"="Yes" ),explanatory_value =factor(explanatory_value, levels =c("No", "Yes")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession", "exp_already_vax")) )}# . had FLU vaxfilter_flu_vax_data <-function(data) { data %>%filter(explanatory =="exp_flu_vax") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="No","1"="Yes" ),explanatory_value =factor(explanatory_value, levels =c("No", "Yes")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession", "exp_already_vax", "exp_flu_vax")) )}
1 - Du Bois challenge.
Du Bois challenge. Recreate the following visualization by W.E.B. Du Bois on family budgets split by income classes for 150 families in Atlanta, Georgia. This visualization was originally created using ink and watercolors.
Note: Since there appears to be some allowable creativity with the features reperesented. I left a scale on the bottom of the parchment, and left off the ‘connecting lines’ connecting the same colored segments together for the stacked bar charts. It ended up being a lot of code - to separately construct and place all pieces of the chart together. First effort. There may be a more efficient way to re-make the plot? . I rendered the output image as html - and I cannot git rid of the small ‘png 2’ label (atm).
png 2
A recreation of ‘Income and Expenditure of 150 Negro Families in Atlanta, GA, USA’ Some re-interpretations were taken: - connecting lines between stacked bar chart areas were left off - slightly modified ‘}’ grouping labels were used on right hand side of bar charts. - an axis was rendered on the bottom to help visualize scale
2 - COVID survey - interpretation
Q2 - Interpret what’s occurring in the survey, and discuss any results that go against your intuition. In a chart this large, “interpret” (as opposed to simply describing) really means identifying trends in the data. Overall description The COVID vaccine survey gathered responses from medical and nursing students across the U.S. regarding their attitudes toward vaccine safety, trust, and recommendations. The visualization arranges responses in a grid, with response variables in columns and explanatory variables (like age, profession, or gender) in rows. Each pane displays the mean Likert score and error bars between the 10th and 90th percentiles for each subgroup, offering insight into both central tendency and variability. The top row summarizes overall distributions, unconditioned by explanatory factors. - Interesting Trends in the Data: 1. Trust and Profession: Medical students displayed more variability in their agreement with the statement “I trust the information that I have received about the vaccines” compared to nursing students. While both groups leaned toward agreement, the broader spread among medical students suggests more diverse opinions, possibly reflecting deeper exposure to varying sources of information or a more analytical approach to evaluating it. 2. Concern About Side Effects and Age: Across all age groups, responses to concerns about “safety and side effects” hovered around a neutral average (Likert score ≈ 3), with relatively wide error bars. This indicates uncertainty or ambivalence. However, younger students tended to show slightly higher trust (i.e. lower concern scores), suggesting that age may play a role in perceived vaccine risk. 3. There is perhaps some counter-intuition at play here among nursing students who responded to the question: “Based on my understanding, I believe this vaccine is safe.” The 10–90th percentile bars span the entire Likert scale, suggesting considerable variability in responses. While this reflects some uncertainty, it may stem from the ambiguity of what understanding means in this context. Perhaps students interpret understanding as requiring a solid grasp of virology—something not all nursing students may have studied in depth. Alternatively, they may associate it with the rapid development timeline of the vaccine, leading to concerns about whether it was produced safely. It seems unlikely, however, that true denial of vaccine safety is the dominant interpretation among nursing students.
Overall, the data reveal meaningful variation in how medical and nursing students interpret the science and safety of COVID-19 vaccines, highlighting the complexity of attitudes even within healthcare education.
Code
#------- no code necessary ..
3 - COVID survey - reconstruct
Q3 ….
Data Analysis - Q1
📄 The original data frame (raw_preview) has:
- 1123 rows
- 14 columns
✅ Rows with only `response_id` and all other fields missing have been removed.
Original dataset rows: 1121
Rows removed: 10
Cleaned dataset size: 1111 rows × 14 columns
**Rows_Removed**
row:3
row:152
row:153
row:414
row:529
row:556
row:577
row:835
row:987
row:1050
Code
# - Step 1a: print the dim of the original df.original_dim <-dim(raw_preview)cat(glue("📄 The original data frame (`raw_preview`) has:\n","- {original_dim[1]} rows\n","- {original_dim[2]} columns\n\n","⚠️ Rows with no available data (i.e., only `response_id` present)\n will be removed in preprocessing.\n","\n✅ **New Dimensions** of `survey_clean` after cleaning:\n","📊 Rows: {nrow(survey_clean)}\n","📐 Columns: {ncol(survey_clean)}\n"))
📄 The original data frame (`raw_preview`) has:
- 1123 rows
- 14 columns
⚠️ Rows with no available data (i.e., only `response_id` present)
will be removed in preprocessing.
✅ **New Dimensions** of `survey_clean` after cleaning:
📊 Rows: 1111
📐 Columns: 14
Code
#-- ... --- based on info in pdf file and .csv .. encode the followingcovid_survey_longer <- survey_clean |>pivot_longer(cols =starts_with("exp_"),names_to ="explanatory",values_to ="explanatory_value" ) |>mutate(explanatory_value =as.factor(explanatory_value)) |>filter(!is.na(explanatory_value)) |>pivot_longer(cols =starts_with("resp_"),names_to ="response",values_to ="response_value" )print(covid_survey_longer)
first pivot_longer(): Converts all columns that start with “exp_” (e.g., exp_profession, exp_gender, etc.) from wide format into long format. Creates two new columns: explanatory: holds the original column names (like “exp_profession”) explanatory_value: holds the actual values from those columns (like “Nursing” or “1”) second pivot_longer(): After already pivoting the explanatory variables, this takes the remaining response variables (resp_safety, resp_confidence_science, etc.) and pivots them long as well. Creates two new columns: response: original column name response_value: corresponding value
create the df/tibble: covid_survey_summary_stats_by_group
Code
# - group the data - by explanatory, explanatory_value, and response calc.# - the following stats:# - mean of the response_value# - low 10th percentile of the response_value# - high 90th percentile of the response_value# - rename the df coivd_survey_summart_stats_by_groupcovid_survey_summary_stats_by_group <- covid_survey_longer |>group_by(explanatory, explanatory_value, response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),.groups ="drop" )print(covid_survey_summary_stats_by_group)
# A tibble: 6 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_s… 3.28 1 5 All ""
2 resp_confidenc… 1.43 1 2 All ""
3 resp_feel_safe… 1.36 1 2 All ""
4 resp_safety 2.03 1 5 All ""
5 resp_trust_info 1.40 1 2 All ""
6 resp_will_reco… 1.21 1 2 All ""
Code
#View(covid_survey_summary_stats_all)
Bind the two df’s create the df/tibble: covid_summary_of_stats
Code
# Get existing levels from grouped dataage_levels <-levels(covid_survey_summary_stats_by_group$explanatory_value)# Add a new level to represent the 'All' groupage_levels_with_all <-c(age_levels, "")# Create the all-summary with the new factor levelcovid_survey_summary_stats_all <- covid_survey_longer |>group_by(response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),explanatory ="All",explanatory_value =factor("", levels = age_levels_with_all),.groups ="drop" )# Ensure grouped summary has the same levels toocovid_survey_summary_stats_by_group$explanatory_value <-factor( covid_survey_summary_stats_by_group$explanatory_value,levels = age_levels_with_all)# Bind them safely nowcovid_survey_summary_stats <-bind_rows( covid_survey_summary_stats_all, covid_survey_summary_stats_by_group)print(covid_survey_summary_stats)
# A tibble: 132 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_… 3.28 1 5 All ""
2 resp_confiden… 1.43 1 2 All ""
3 resp_feel_saf… 1.36 1 2 All ""
4 resp_safety 2.03 1 5 All ""
5 resp_trust_in… 1.40 1 2 All ""
6 resp_will_rec… 1.21 1 2 All ""
7 resp_concern_… 3.35 2 4.4 exp_age_bin "0"
8 resp_confiden… 1.65 1 2.4 exp_age_bin "0"
9 resp_feel_saf… 1.71 1 3.8 exp_age_bin "0"
10 resp_safety 1.41 1 2 exp_age_bin "0"
# ℹ 122 more rows
# Labels for rows (explanatory variables), including Gender and Raceexplanatory_labels <-c(All ="All",exp_age_bin ="Age",exp_gender ="Gender",exp_race ="Race" ,# Added Race labelexp_ethnicity ="Ethnicity",exp_profession ="Profession",exp_already_vax ="Had COVID\nVaccine",exp_flu_vax ="Had flu Vaccine\nthis year")# - call formatting for encoded datacovid_age_only <-filter_age_data(covid_survey_summary_stats_by_group)covid_gender_only <-filter_gender_data(covid_survey_summary_stats_by_group)covid_race_only <-filter_race_data(covid_survey_summary_stats_by_group)covid_ethnicity_only <-filter_ethnicity_data(covid_survey_summary_stats_by_group)covid_profession_only <-filter_profession_data(covid_survey_summary_stats_by_group)covid_covid_vax_only <-filter_vax_status_data(covid_survey_summary_stats_by_group)covid_flu_vax_only <-filter_flu_vax_data(covid_survey_summary_stats_by_group)# Label mappings for responseresponse_labels <-c(resp_safety ="Based on my\n understanding I\n belive the vaccine\n is safe.",resp_feel_safe_at_work ="Getting the vaccine\nwill make me feel\nsafer at work.",resp_concern_safety ="I am concerned\nabout the safety\nand side effects of\nthe vaccine.",resp_confidence_science ="I am confident in\n the scientific\n vetting process for\n the new COVID\vaccines.",resp_trust_info ="I trust the\ninformation that I\nhave received about\nthe vaccines",resp_will_recommend ="I will recommend\nthe vaccine to\nfamily, friends,\nand community\nmembers.")# Reorder response factor levels to match response_labelscovid_age_only <- covid_age_only %>%mutate(response =factor(response, levels =names(response_labels)))covid_gender_only <- covid_gender_only %>%mutate(response =factor(response, levels =names(response_labels)))# View distinct codes used in the exp_ethnicity variable# covid_survey_summary_stats_by_group %>%# filter(explanatory == "exp_ethnicity") %>%# mutate(explanatory_value = as.character(explanatory_value)) %>%# distinct(explanatory_value) %>%# arrange(explanatory_value)# - medical profession#filter_profession_data# Vector controlling heights of each row - add height for racerow_heights <-c(0.5, # - 'All' row height — adjust as needed3, # - 'exp_age_bin' row height3, # - 'exp_gender' row height - adjust as desired3, # - 'exp_race' row height - new Race row3, # - ethnicity2, # - profession2, # - Covid Vax2# -Flu Vax)# Reorder response factor levels for 'All' layercovid_all_only <- covid_survey_summary_stats_all %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(response =factor(response, levels =names(response_labels)))# - vars for standardizing box size row/col# Define variables for strip appearancestrip_fill_color <-"gray90"strip_text_color <-"black"strip_text_size <-9strip_text_face <-"plain"strip_text_angle_x <-0strip_text_angle_y <-0strip_text_vjust_y <-0.5strip_placement <-"outside"# already used in your code# Call some functionsg0 <-plot_all(covid_all_only)# - second layer - Ageg1 <-plot_age(covid_age_only)# - third layer - genderg2 <-plot_gender(covid_gender_only)# Fourth layer - Raceg3 <-plot_race(covid_race_only)# Fifth layer: Ethnicity (if present)g4 <-plot_ethnicity(covid_ethnicity_only,"")#g5 <-plot_profession(covid_profession_only,"")#g6 <-plot_vax_status(covid_covid_vax_only,"")#g7 <-plot_flu_vax_status(covid_flu_vax_only,"Error bars in range from 10th to 90th percentile")# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)composite_plot <- (g0 / g1 / g2 / g3 / g4 / g5 /g6 /g7 +plot_layout(heights = row_heights)) &theme(plot.margin =margin(0, 0, 0, 0))print(composite_plot)
4 - COVID survey - re-reconstruct
Q4 ….Make Plot from Q3, but use different end point quarantiles. When the error bars represent the 25th and 75th percentiles instead of the 10th and 90th, the intervals become narrower, reflecting a tighter range around the median of the data. This change reduces the apparent variability and uncertainty in responses. Compared to the previous plot, the shorter error bars may make the group differences appear more precise but potentially understate the true variability. Therefore, while the overall trends remain similar, conclusions about the degree of uncertainty should be adjusted to recognize that the interquartile range excludes more extreme values.
# A tibble: 132 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_… 3.28 2 4 All ""
2 resp_confiden… 1.43 1 2 All ""
3 resp_feel_saf… 1.36 1 1 All ""
4 resp_safety 2.03 1 3 All ""
5 resp_trust_in… 1.40 1 2 All ""
6 resp_will_rec… 1.21 1 1 All ""
7 resp_concern_… 3.35 2 4 exp_age_bin "0"
8 resp_confiden… 1.65 1 2 exp_age_bin "0"
9 resp_feel_saf… 1.71 1 2 exp_age_bin "0"
10 resp_safety 1.41 1 2 exp_age_bin "0"
# ℹ 122 more rows
5 - COVID survey - another view
Q5a …. COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.
a. Create a diverging bar chart. Write alt text for your visualization as well. Write alt text for your visualization as well.
Code
# - Using Likert Data for 6 survey questions, create a diverging bar chart.# Response labels (questions)response_labels <-c(resp_safety ="Vaccine is safe",resp_feel_safe_at_work ="Feel safer at work",resp_concern_safety ="Concern re : vaccine side effects",resp_confidence_science ="Confidence in scientific vetting",resp_trust_info ="Trust in vaccine info",resp_will_recommend ="Will recommend vaccine")# Likert response mappinglikert_scores <-c("1"="Strongly Agree","2"="Somewhat Agree","3"="Neither Agree Nor Disagree","4"="Somewhat Disagree","5"="Strongly Disagree")# Your custom viridis palette (turquoise to yellow-green)my_turquoise_to_yellow <-viridis(12,begin =0.25,end =0.85,option ="viridis")[c(1, 3, 5, 7, 9, 11)]# 1. Clean and assign numeric scores for mean computationlikert_means <- covid_survey_longer %>%filter( response %in%names(response_labels),!is.na(response_value), response_value !="" ) %>%distinct(response_id, response, .keep_all =TRUE) %>%mutate(response_label =factor(response_labels[response], levels = response_labels),response_numeric =as.numeric(response_value) ) %>%group_by(response_label) %>%summarize(mean_score =mean(response_numeric, na.rm =TRUE),sd_score =sd(response_numeric, na.rm =TRUE),n =n() ) %>%ungroup()# 2. Calculate overall mean for centeringoverall_mean <-mean(likert_means$mean_score, na.rm =TRUE)# 3. Assign color scale (optional: color by mean score direction)likert_means <- likert_means %>%mutate(centered_score = mean_score - overall_mean,direction =ifelse(centered_score <0, "positive", "negative") )# Add magma color mapping for Likert scoreslikert_colors <- viridis::viridis(5, option ="magma")# Prepare Likert labels with colorlikert_xlabels <-data.frame(likert_score =1:5,centered_score =1:5- overall_mean,color = likert_colors,label =as.character(1:5))# Pick a magma tone (e.g., middle of the scale)magma_color <- viridis::viridis(1, option ="magma")g_centered <-ggplot(likert_means, aes(x = centered_score, y =fct_rev(response_label), fill = direction)) +geom_col(width =0.6) +annotate("segment",x =0, xend =0,y =0.5, yend =6.5, # Adjust y range to control vertical spanlinetype ="dashed",color ="black",linewidth =0.8 ) +scale_x_continuous(name ="Deviation from Overall Mean Likert Score",limits =c(-1.5, 1.5),breaks =seq(-1.5, 1.5, by =0.5),sec.axis =dup_axis(name =NULL, labels =NULL) ) +scale_y_discrete(expand =expansion(mult =c(0.05, 0.32)) ) +scale_fill_manual(values =c(positive = my_turquoise_to_yellow[2], negative = my_turquoise_to_yellow[5]),name ="Likert Score Relation",labels =c(positive ="Below Mean", negative ="Above Mean") ) +labs(title ="Mean-Centered Likert Scores by Question\n",y =NULL ) +annotate("segment",x =0.45, xend =0,y =1.3, yend =1.6,color ="black",arrow =arrow(length =unit(0.2, "cm")),linewidth =0.6 ) +annotate("label",x =0.5, y =1.1,label =sprintf("Mean = %.2f", overall_mean),hjust =0,vjust =0,size =4.5,fill ="white",color ="black",label.size =0.4,label.r =unit(0.15, "lines"),label.padding =unit(0.3, "lines") ) +# Likert numbers colored with magma palettegeom_text(data = likert_xlabels,aes(x = centered_score, y =6.8, label = likert_score),inherit.aes =FALSE,size =7,color = likert_colors ) +# Likert Scores label colored in magma (middle color)annotate("text",x =0, y =7.4,label ="Likert Scores",size =7,fontface ="italic",color = likert_colors[3] ) +theme_minimal() +theme(plot.title =element_text(size =16, face ="bold", hjust =0.5),axis.text.y =element_text(size =15, lineheight =2),axis.text.x =element_text(size =14),axis.title.x.top =element_text(size =10, color ="gray30", face ="italic", hjust =1),legend.position ="bottom",legend.title =element_text(size =12, face ="bold"),legend.text =element_text(size =11),plot.margin =margin(t =10, r =10, b =10, l =10) )plot(g_centered)
Q5a - Alt text A horizontal diverging bar chart presents mean-centered Likert scores for six COVID vaccine survey questions: “Vaccine is safe,” “Feel safer at work,” “Concern re : vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.” Each bar extends from a vertical midpoint at the mean response (1.79) to show how average responses deviate above or below the overall mean, with greater lengths indicating stronger deviation. Turquoise segments indicate questions with more agreement (e.g., stronger confidence or willingness to recommend), while yellow-green segments reflect comparatively more skepticism. The chart includes an annotated mean marker, and Likert scale values from “1 - Strongly Agree” to “5 - Strongly Disagree” appear along the top axis in magma tones to reinforce the interpretive range. All questions except those concerning vaccine safety and side effects tend towards ‘1 - Strongly Agree.’ The scores for ‘Vaccine is safe’ and ‘concerns about vaccine side effects’, while above the mean, did not score into Likert-4 or Likert-5, indicating uncertainty or abmivalence but not concern.
Q5b …. COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale. b. Create a 100% bar chart Write alt text for your visualization as well.
Code
# Full Likert labels - for the legendlikert_labels_full <-c("1"="1 - Strongly Agree","2"="2 - Somewhat Agree","3"="3 - Neither Agree Nor Disagree","4"="4 - Somewhat Disagree","5"="5 - Strongly Disagree")# Filter and prepare proportionslikert_props <- covid_survey_longer %>%filter(response %in%names(response_labels), !is.na(response_value), response_value !="") %>%mutate(response_label =factor(response_labels[response], levels = response_labels),likert_level =factor(likert_labels_full[response_value], levels = likert_labels_full) ) %>%group_by(response_label, likert_level) %>%summarize(n =n(), .groups ="drop") %>%group_by(response_label) %>%mutate(percentage = n /sum(n) *100)# Build 100% stacked bar chart with full Likert labelsg5b <-ggplot(likert_props, aes(x = percentage, y =fct_rev(response_label), fill = likert_level)) +geom_col(width =0.7) +scale_fill_viridis_d(option ="viridis",begin =0.2,end =0.85,direction =-1,name ="Response" ) +labs(title ="Distribution of Likert Responses by Question",x ="Percentage",y =NULL ) +theme_minimal() +theme(plot.title =element_text(size =18, face ="bold", hjust =0.5),legend.position ="right",legend.title =element_text(size =12, face ="bold"),legend.text =element_text(size =11),axis.text.y =element_text(size =14),axis.text.x =element_text(size =14) )plot(g5b)
Q5b - Alt text A horizontal stacked bar chart displays the distribution of Likert-scale responses to six survey questions on COVID vaccine attitudes, including: “Vaccine is safe,” “Feel safer at work,” “Concern re: vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.” Each bar represents a question, divided into colored segments for each response level from “1 - Strongly Agree” to “5 - Strongly Disagree,” using a perceptually uniform Viridis palette. Most responses cluster on the agreement end of the scale, especially for trust and confidence statements, while the item “Concern about vaccine safety” shows a more neutral distribution centered around “Somewhat Agree” and “Neither Agree Nor Disagree.” The visual design emphasizes sentiment distribution and is optimized for accessibility, including colorblind-friendly contrast and consistent labeling.
Source Code
---title: "HW 03"author: "Nathan Herling"date: "2025-06-20"_due: "Friday-June-13-2025" format: html: embed-resources: true code-fold: true code-tools: true toc: true css: styles.css # ✅ Add this line to link your CSS file---```{r setup}#--------------------->################## Package Setup ###################Check if pacman [package manager] is installed, if not install it.#throw [FYI] alert either way.if (!requireNamespace("pacman", quietly = TRUE)) { message("Installing 'pacman' (not found locally)...") install.packages("pacman")} else { message("[FYI]\n'pacman' already installed — skipping install.")}# use this line for installing/loading# pacman::p_load()# - packages to load stored in a variable (vector)pkgs <- c( "tidyverse", "glue", "scales", "lubridate", "patchwork", "ggh4x", "ggrepel", "openintro", "ggridges", "dsbox", "janitor", "here", "knitr", "ggthemes", "ggplot2", "kableExtra", "palmerpenguins", "grid", "htmltools", "plotly", "ggforce", "cowplot", "magick", "forcats", "stringr", "viridis")# - load from the character array/vectorpacman::p_load(char=pkgs)# - install tidyverse/dsbox directly from Git Hub# - this allows for the possible need to install on a repo. pull.# - and, if it's already installed just thorw an alert.if (!requireNamespace("dsbox", quietly = TRUE)) { message("Installing 'dsbox' from GitHub (not found locally)...") suppressMessages(devtools::install_github("tidyverse/dsbox"))} else { message("[FYI]\n'dsbox' already installed — skipping GitHub install.")}# - alert to user packages loaded.# Set number of columns (adjustable)n_cols <- 4# Add * to each package namepkgs <- paste0("* ", pkgs)# Calculate number of rows based on total packagesn_rows <- ceiling(length(pkgs) / n_cols)# Pad with empty strings to complete gridpkgs_padded <- c(pkgs, rep("", n_rows * n_cols - length(pkgs)))# Create matrix (fill by row)pkg_matrix <- matrix(pkgs_padded, nrow = n_rows, byrow = TRUE)# Print headercat("The packages loaded:")# Loop and print each row (use invisible to suppress NULL)invisible(apply(pkg_matrix, 1, function(row) { cat(paste(format(row, width = 22), collapse = ""), "\n")}))#-------------------------->####################### Basic set Theme up ######################## ---- set theme for ggplot2ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))# set width of code outputoptions(width = 65)# set figure parameters for knitrknitr::opts_chunk$set( fig.width = 7, # 7" width fig.asp = 0.618, # the golden ratio fig.retina = 3, # dpi multiplier for displaying HTML output on retina fig.align = "center", # center align figures dpi = 300 # higher dpi, sharper image)## ---- end theme set up```## (>>>>) - function block<div class="question-box">In an effort to reduce repeating code a function block was created.</div>```{r}#| label: label-me-007# ............ A function block, to handle Q3,Q4 with minimal code duplication# - size as a variableset_dot_size <-1# Function for the "All" group plot (g0)plot_all <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(response =as_labeller(response_labels),explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =11) +labs(title ="COVID-19 Vaccine Attitudes by Demographic Group",x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsplot.title =element_text(hjust =0.5),strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_text(vjust =0.5,size = strip_text_size,margin =margin(t =20, b =10, r =10, l =10) ),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =28, l =28) ),axis.text.y =element_blank(),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Age plot (g1)plot_age <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =25, l =25) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Gender plot (g2)plot_gender <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =17, l =17) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),panel.spacing =unit(1, "lines"),axis.ticks.x =element_blank() )}# Function for the Race plot (g3)plot_race <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =22, l =22) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Ethnicity plot (g4)plot_ethnicity <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =18, l =18) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# Function for the Profession plot (g5)plot_profession <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =14, l =14) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# Function for the Vax(COVID) plot (g6)plot_vax_status <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =NULL ) +theme(panel.spacing.x =unit(0.1, "lines"), # Increase spacing between columnsstrip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =12, l =12) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),axis.ticks.x =element_blank(),panel.spacing =unit(1, "lines") )}# (g7) - flu Vxplot_flu_vax_status <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =paste0("Mean Likert score\n(Error bars: ", sub_title_specific, ")") ) +theme(panel.spacing.x =unit(0.1, "lines"),strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =4, l =7) ),axis.text.y =element_text(size =10),axis.text.x =element_text(size =10), # <- show x-axis Likert labelsaxis.ticks.x =element_line(), # <- show x-axis tickspanel.spacing =unit(1, "lines"))}# ..... prepare the variables.# . ethnicity.filter_ethnicity_data <-function(data) { data %>%filter(explanatory =="exp_ethnicity") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="Hispanic/Latino","2"="Non-Hispanic/Non-Latino"),explanatory_value =factor(explanatory_value, levels =c("Hispanic/Latino", "Non-Hispanic/Non-Latino" )),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity" )) )}# . agefilter_age_data <-function(data) { data %>%filter(explanatory =="exp_age_bin") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="<20","20"="21-25","25"="26-30","30"=">30" ),explanatory_value =factor(explanatory_value, levels =c("<20", "21-25", "26-30", ">30")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . genderfilter_gender_data <-function(data) { data %>%filter(explanatory =="exp_gender") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =as.character(explanatory_value),explanatory_value =fct_recode(factor(explanatory_value),"Prefer not to say"="4","Non-binary third gender"="3","Male"="0","Female"="1" ),explanatory_value =factor(explanatory_value, levels =rev(c("Prefer not to say","Non-binary third gender","Male","Female" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . racefilter_race_data <-function(data) { data %>%filter(explanatory =="exp_race") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="American Indian/Alaska Native","2"="Asian","3"="Black/African American","4"="Native Hawaiian/Other Pacific Islander","5"="White" ),explanatory_value =factor(explanatory_value, levels =rev(c("White","Native Hawaiian/Other Pacific Islander","Black/African American","Asian","American Indian/Alaska Native" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . medical profssionfilter_profession_data <-function(data) { data %>%filter(explanatory =="exp_profession") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="Medical","1"="Nursing" ),explanatory_value =factor(explanatory_value, levels =c("Nursing", "Medical")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession")) )}# . had covid vaxfilter_vax_status_data <-function(data) { data %>%filter(explanatory =="exp_already_vax") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="No","1"="Yes" ),explanatory_value =factor(explanatory_value, levels =c("No", "Yes")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession", "exp_already_vax")) )}# . had FLU vaxfilter_flu_vax_data <-function(data) { data %>%filter(explanatory =="exp_flu_vax") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="No","1"="Yes" ),explanatory_value =factor(explanatory_value, levels =c("No", "Yes")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_profession", "exp_already_vax", "exp_flu_vax")) )}```## 1 - Du Bois challenge.<div class="question-box">Du Bois challenge. Recreate the following visualization by W.E.B. Du Bois on family budgets split by income classes for 150 families in Atlanta, Georgia. This visualization was originally created using ink and watercolors.</div><div class="note-box">Note: Since there appears to be some allowable creativity with the features reperesented.I left a scale on the bottom of the parchment, and left off the 'connecting lines' connecting the same colored segments together for the stacked bar charts.It ended up being <b>a lot</b> of code - to separately construct and place all pieces of the chart together.First effort. There may be a more efficient way to re-make the plot?.I rendered the output image as html - and I cannot git rid of the small 'png 2' label (atm).</div>```{r,fig.width=12, fig.height=8, out.width="95%"}#| label: label-me-1#| echo: false#| results: asis# Load the dataincome_data <- read_csv("data/income.csv", show_col_types = FALSE)# --- New: Extract data for col_0, row_1..5 overlay table ---table_data <- income_data |> select(Class, `Actual Average` = Average_Income) |> mutate(`Actual Average` = dollar(`Actual Average`)) # Format as US Dollars# Add header rowheader_row <- tibble(Class = "Class", `Actual Average` = "ACTUAL AVERAGE")table_data <- bind_rows(header_row, table_data)# Load parchment imageinvisible(background_img <- image_read("images/parchment_sheet.png"))#- Get image dimensions ... for initial examination of how large the background image is.invisible(info <- image_info(background_img))img_width <- info$widthimg_height <- info$height# Margins and title heightmargin <- 40title_height <- 100gap_below_title <- -50# Create transparent canvas for the title texttitle_canvas <- image_blank(width = img_width, height = title_height + margin, color = "none")# Annotate the titletitle_text <- "INCOME AND EXPENDITURE OF 150 NEGRO FAMILIES IN ATLANTA,GA.,USA."title_layer <- image_annotate( title_canvas, text = title_text, size = 25, gravity = "north", location = "+0+40", font = "IM FELL English SC", weight = 700, color = "#000000BB")title_layer <- image_blur(title_layer, radius = 0.5, sigma = 0.3)composite_img <- image_composite(background_img, title_layer, offset = "+0+0")# Compute usable space for grid below titleusable_width <- img_width - 2 * marginusable_height <- img_height - margin - title_heightn_cols <- 6n_rows <- 4cell_width <- usable_width / n_colscell_height <- usable_height / n_rows# Grid positionsgrid_top <- margin + title_height + gap_below_titlegrid_bottom <- img_height - margingrid_left <- margingrid_right <- img_width - margin# Read and resize image to fit cell (0,0)annual_income_img <- image_read("images/annual_income.png")annual_income_img_resized <- image_scale(annual_income_img, geometry = paste0(cell_width, "x", cell_height, "!"))x_pos <- grid_lefty_pos <- grid_topcomposite_img <- image_composite(composite_img, annual_income_img_resized, offset = paste0("+", x_pos, "+", y_pos))# Add label with border across cols 1 to 5span_x_left <- grid_left + cell_width * 1span_x_right <- grid_left + cell_width * 6span_width <- span_x_right - span_x_leftlabel_text <- image_blank(width = span_width, height = 50, color = "none")label_text <- image_annotate( label_text, text = "ANNUAL EXPENDITURE FOR", size = 25, gravity = "center", font = "IM FELL English SC", weight = 0, color = "#000000BB")label_trimmed <- image_trim(label_text)label_padded <- image_extent(label_trimmed, geometry = paste0(span_width, "x", image_info(label_trimmed)$height), gravity = "center")label_with_border <- image_border(label_padded, color = "black", geometry = "2x2")label_height <- image_info(label_with_border)$heightlabel_y <- grid_top - label_height + 2label_x <- span_x_leftcomposite_img <- image_composite( composite_img, label_with_border, offset = paste0("+", round(label_x), "+", round(label_y)))# ................# Add column titles (RENT, FOOD, etc.)top_col_titles <- c("RENT", "FOOD", "CLOTHES", "DIRECT TAXES", "OTHER EXPENSES AND SAVINGS")title_height_area <- 15 # uniform height for all title boxesfor (i in 1:5) { title_x_left <- grid_left + (i * cell_width) title_width <- cell_width # Estimate font size to fit the column width max_text_width <- title_width - 10 # padding title_font_size <- 25 title_box_temp <- image_blank(width = title_width, height = title_height_area, color = "none") repeat { title_test <- image_annotate( title_box_temp, text = top_col_titles[i], size = title_font_size, gravity = "center", font = "Broadway", weight = 0, color = "#000000BB" ) if (image_info(title_test)$width <= max_text_width || title_font_size <= 9.5) break title_font_size <- title_font_size - 1 } title_box <- image_blank(width = title_width, height = title_height_area, color = "none") title_annotated <- image_annotate( title_box, text = top_col_titles[i], size = title_font_size, gravity = "center", font = "IM FELL English SC", weight = 0, color = "#000000BB" ) title_with_border <- image_border(title_annotated, color = "black", geometry = "2x2") title_y <- grid_top - title_height_area + 15 title_x <- title_x_left composite_img <- image_composite( composite_img, title_with_border, offset = paste0("+", round(title_x), "+", round(title_y)) )}# --- NEW: Add image icons below headers and above footers ---images_col_headers <- c( "images/rent.jpg", "images/food.png", "images/clothes.png", "images/direct_taxes.png", "images/other_expenses_savings.png")icon_height <- 132icon_gap_top <- 3icon_gap_bottom <- 3for (i in 1:5) { icon_x_left <- grid_left + (i * cell_width) icon_width <- cell_width icon_y <- grid_top + title_height_area + icon_gap_top icon_img <- image_read(images_col_headers[i]) icon_resized <- image_scale(icon_img, geometry = paste0(icon_width, "x", icon_height, "!")) icon_with_border <- image_border(icon_resized, color = "black", geometry = "2x2") composite_img <- image_composite( composite_img, icon_with_border, offset = paste0("+", round(icon_x_left), "+", round(icon_y)) )}# --- Restore column footers with colors for columns 1 to 5 ---c_palette <- c("#1B1B19", "#755D75", "#C18C7E", "#ab9f9d", "#d9e6e8")footer_height_area <- title_height_areafor (i in 1:5) { footer_x_left <- grid_left + (i * cell_width) footer_width <- cell_width footer_box <- image_blank(width = footer_width, height = footer_height_area, color = c_palette[i]) footer_with_border <- image_border(footer_box, color = "black", geometry = "2x2") footer_y <- grid_top + cell_height - footer_height_area - 4 footer_x <- footer_x_left composite_img <- image_composite( composite_img, footer_with_border, offset = paste0("+", round(footer_x), "+", round(footer_y)) )}#--------------------------------------------------------------------->>> checked#.... LEFT COLUMN TABLE ==================================>>># --- NEW: Render table in col_0, rows 0 to 6 (header + 6 rows) ---# --- UPDATED: Render table spanning from row_1 to row_5 (entire vertical height of 4 grid rows) ---library(stringr) # make sure stringr is loaded# Add horizontal scale factor (e.g., 0.85 means 85% width)horizontal_scale <- 0.91col_width <- (cell_width / 2) * horizontal_scale # Scaled column widthcell_x <- grid_leftnum_rows <- nrow(table_data)table_top_y <- grid_top + cell_height + 5 # start at row_1 - manually adjust height(s)table_total_height <- 4 * cell_height # span rows 1 to 5 (4 grid rows)header_row_height <- 20 # smaller height for headerdata_rows_height <- table_total_height - header_row_heightdata_row_height <- data_rows_height / (num_rows - 1) - 25 # divide remaining height among data rows# Define wrap width — adjust based on your cell width and font sizewrap_width <- 10# Render header rowrow_y <- table_top_y# Wrap the header cell text (Class)class_text_wrapped <- str_wrap(table_data$Class[1], width = wrap_width)class_box <- image_blank(width = col_width, height = header_row_height, color = "none")class_box <- image_annotate( class_box, text = class_text_wrapped, size = 9, gravity = "center", font = "IM FELL English SC", weight = 300, color = "#000000DD")class_box_bordered <- image_border(class_box, color = "black", geometry = "1x1")# income header as beforeincome_box <- image_blank(width = col_width, height = header_row_height, color = "none")income_box <- image_annotate( income_box, text = table_data$`Actual Average`[1], size = 7.5, gravity = "center", font = "IM FELL English SC", weight = 300, color = "#000000DD")income_box_bordered <- image_border(income_box, color = "black", geometry = "1x1")composite_img <- image_composite(composite_img, class_box_bordered, offset = paste0("+", cell_x, "+", round(row_y)))composite_img <- image_composite(composite_img, income_box_bordered, offset = paste0("+", cell_x + col_width, "+", round(row_y)))row_y <- row_y + header_row_height# Render data rows with pale brownish backgroundfor (i in 2:num_rows) { bg_col <- ifelse(i %% 2 == 0, "#E3D6BD99", "#FFFFFF00") class_bg <- image_blank(width = col_width, height = data_row_height, color = bg_col) income_bg <- image_blank(width = col_width, height = data_row_height, color = bg_col) # Wrap class cell text here class_text_wrapped <- str_wrap(table_data$Class[i], width = wrap_width) class_cell <- image_annotate( class_bg, text = class_text_wrapped, size = 9, gravity = "center", font = "IM FELL English SC", weight = 300, color = "#000000DD" ) income_cell <- image_annotate( income_bg, text = table_data$`Actual Average`[i], size = 9, gravity = "center", font = "IM FELL English SC", weight = 300, color = "#000000DD" ) class_cell_bordered <- image_border(class_cell, color = "black", geometry = "1x1") income_cell_bordered <- image_border(income_cell, color = "black", geometry = "1x1") composite_img <- image_composite(composite_img, class_cell_bordered, offset = paste0("+", cell_x, "+", round(row_y))) composite_img <- image_composite(composite_img, income_cell_bordered, offset = paste0("+", cell_x + col_width, "+", round(row_y))) row_y <- row_y + data_row_height}#... BAR CHART OVERLAY WITH CONTROLS ===============================>>># --- Adjustable Variables ---bar_chart_height_scale <- 0.75 # Controls vertical compression of bar chart (0 to 1)bar_spacing_ratio <- 0.7 # Controls thickness of bars (height of each bar)bar_chart_offset_x <- 0 # Horizontal adjustment in pixelsbar_chart_offset_y <- 5 # Vertical adjustment in pixelsbar_gap_size <- 1 # Controls vertical spacing between bar groups (not used now)# --- Data Prep ---income_long <- income_data %>% pivot_longer(cols = c("Rent", "Food", "Clothes", "Tax", "Other"), names_to = "name", values_to = "value") %>% filter(!is.na(value), value != 0) %>% mutate(name = toupper(name)) %>% mutate(textcol = ifelse(name == "RENT", "1", "0"))# --- Create spacer factor levels between Classes ---classes <- rev(unique(income_data$Class)) # Reversed order for top-to-bottom inversionspacer_levels <- paste0("SPACER_", seq_len(length(classes) - 1))# Interleave classes and spacer_levels manuallynew_levels <- character(length(classes) + length(spacer_levels))new_levels[c(TRUE, FALSE)] <- classesnew_levels[c(FALSE, TRUE)] <- spacer_levels# Assign Class_spaced with original classes firstincome_long <- income_long %>% mutate(Class_spaced = factor(as.character(Class), levels = classes))# Create spacer rows with zero value for each spacer levelspacer_rows <- data.frame( Class_spaced = factor(spacer_levels, levels = new_levels), name = "SPACER", value = 0, textcol = "0")income_augmented <- bind_rows(income_long, spacer_rows) %>% mutate(Class_spaced = factor(as.character(Class_spaced), levels = new_levels)) %>% arrange(Class_spaced, name != "SPACER")# Prepare a helper dataset for connectorsstack_order <- c("OTHER", "TAX", "CLOTHES", "FOOD", "RENT")connector_data <- income_augmented %>% filter(name != "SPACER") %>% mutate(name = factor(name, levels = stack_order)) %>% group_by(name) %>% mutate(group_index = row_number()) %>% ungroup()# Build the plotincome_plot <- ggplot(income_augmented, aes( x = Class_spaced, y = value, fill = factor(name, levels = stack_order), label = paste0(formatC(value, format = "f", digits = 1), "%"))) + geom_bar( stat = "identity", position = "stack", width = bar_spacing_ratio, color = NA, linewidth = 0.2 ) + geom_text( data = subset(income_augmented, name != "SPACER"), aes(color = textcol), position = position_stack(vjust = 0.5), size = 3, show.legend = FALSE ) + coord_flip() + scale_x_discrete(expand = expansion(add = c(0.5, 0.5))) + theme( legend.position = "none", plot.margin = margin(20 + bar_chart_offset_y, 40 + bar_chart_offset_x, 0, 20), plot.background = element_rect(fill = NA, color = NA), panel.background = element_rect(fill = NA, color = NA), legend.background = element_rect(fill = NA), legend.key = element_rect(fill = NA), axis.title = element_blank(), text = element_text(family = "mono"), axis.ticks = element_blank(), axis.line = element_blank(), panel.border = element_blank(), axis.text.y = element_blank(), plot.title = element_blank(), panel.grid = element_blank() ) + scale_fill_manual(values = c("#cbdfbd", "#8e9aaf", "#d78879", "#a08294", "#161213")) + scale_color_manual(values = c("black", "white")) + labs(fill = "", x = "", y = "")# --- Render and Position Chart ---temp_file <- tempfile(fileext = ".png")ggsave(filename = temp_file, plot = income_plot, width = 8, height = 5, dpi = 144, bg = "transparent")bar_chart_img <- image_read(temp_file)bar_chart_width <- cell_width * 5.5bar_chart_height <- cell_height * 4 * bar_chart_height_scale # Scale heightbar_chart_resized <- image_scale(bar_chart_img, paste0(bar_chart_width, "x", bar_chart_height, "!"))bar_chart_x <- grid_left + cell_width + bar_chart_offset_x - 50bar_chart_y <- grid_top + cell_height + bar_chart_offset_y# ... render images on top of each other..composite_img <- image_composite( composite_img, bar_chart_resized, offset = paste0("+", round(bar_chart_x), "+", round(bar_chart_y)))# === Add Right-Side Vertical Class Labels ===# Define the labels and how many bar rows they spanright_labels <- c("POOR", "FAIR", "COMFORTABLE", "WELL-TO-DO")row_counts <- c(2, 2, 2, 1)# Total rows = 7 actual class rows (no spacers included)total_rows <- sum(row_counts)# Compute height per row (based on the bar chart height)row_height <- bar_chart_height / total_rows# Starting Y for the bottom-most labellabel_start_y <- bar_chart_y# Label X position (to the right of bar chart)label_x <- bar_chart_x + bar_chart_width - 150 # Add paddingshift_down_pixels <- 10 # adjust as you want (positive = move down)# -for (i in seq_along(right_labels)) { label_text <- right_labels[i] rows_span <- row_counts[i] label_height <- row_height * rows_span label_y <- 5 + label_start_y + (row_height * (sum(row_counts[1:(i-1)]))) + (label_height / 2) # Move POOR up 2 rows (your existing adjustment) if (label_text == "POOR") { label_y <- label_y - 2 * row_height } # NEW: move POOR and FAIR *down* by shift_down_pixels if (label_text %in% c("POOR", "FAIR")) { label_y <- label_y + shift_down_pixels } # Create and annotate image label_img <- image_blank(width = 30, height = label_height, color = "none") label_img <- image_annotate( label_img, text = label_text, size = 11, gravity = "center", font = "IM FELL English SC", weight = 300, color = "#000000DD", degrees = 270 # Vertical text (bottom to top) ) #label_img <- image_trim(label_img) # <-- removes padding # ➕ Create brace image brace_img <- image_blank(width = 40, height = label_height, color = "none") brace_img <- image_annotate( brace_img, text = "}", # curly brace size = label_height * 0.9, # scale size to span rows gravity = "center", font = "Times", # or another serif font with a clear brace color = "#000000AA" ) # ➕ Combine brace and label horizontally combined_img <- image_append(c(brace_img, label_img)) # 🧷 Composite onto final image composite_img <- image_composite( composite_img, combined_img, offset = paste0("+", round(label_x+70), "+", round(label_y - label_height / 2)) )}#======================================>>>># Draw the grid and requested horizontal lines inside the left tablefinal_img <- image_draw(composite_img)# Thickness of cell border lines (in pixels)line_thickness <- 1# x positions: start and end of left table (two columns combined)x_start <- cell_xx_end <- cell_x + 2.8 * col_width# y positions for horizontal lines, adjustable for rows 1 to 7y_positions <- numeric(8) # 7 rows + header# row_1 (header row top)y_positions[1] <- table_top_y# row_2 to row_8 (header + 7 rows total)for (r in 2:8) { if (r == 2) { y_positions[r] <- table_top_y + header_row_height } else { y_positions[r] <- y_positions[r - 1] + data_row_height }}# Draw thin black horizontal arrows at each row boundary (except header)for (y in y_positions[-1]) { arrows( x0 = x_start, y0 = y, x1 = x_end, y1 = y, col = "black", lwd = line_thickness, length = 0.08, angle = 20, code = 2 )}# Add one extra arrow extending off the edge under last data rowy_adjust <- 69arrows( x0 = cell_x + 2 * col_width, y0 = y_positions[8] + y_adjust, x1 = cell_x + 3 * col_width - 15, y1 = y_positions[8] + y_adjust, col = "black", lwd = line_thickness, length = 0.08, angle = 20, code = 2)dev.off() # finish image_draw and save to final_img# Replace composite_img with final_img (arrows added)composite_img <- final_img#=========== OUTPUT FILE ============# Write outputinvisible(image_write(composite_img, path = "output/final_composite.png"))invisible()# Optionally display the image#print(composite_img)# Embed it explicitly into HTMLcat('<img src="output/final_composite.png" style="width:100%;">')#knitr::include_graphics("output/final_composite.png")```<div class="note-box">A recreation of 'Income and Expenditure of 150 Negro Families in Atlanta, GA, USA'<br>Some re-interpretations were taken:<br>- connecting lines between stacked bar chart areas were left off<br>- slightly modified '}' grouping labels were used on right hand side of bar charts.<br>- an axis was rendered on the bottom to help visualize scale<br></div>## 2 - COVID survey - interpretation<div class="question-box">Q2 - <b>Interpret what’s occurring in the survey, and discuss any results that go against your intuition.</b> <br>In a chart this large, <i>“interpret”</i> (as opposed to simply describing) really means identifying trends in the data.<br><b> Overall description</b>The COVID vaccine survey gathered responses from medical and nursing students across the U.S. regarding their attitudes toward vaccine safety, trust, and recommendations. The visualization arranges responses in a grid, with response variables in columns and explanatory variables (like age, profession, or gender) in rows. Each pane displays the mean Likert score and error bars between the 10th and 90th percentiles for each subgroup, offering insight into both central tendency and variability. The top row summarizes overall distributions, unconditioned by explanatory factors.<br><b>-</b><br><b>Interesting Trends in the Data:</b><br><b>1. <i>Trust and Profession:</i></b> <br>Medical students displayed more variability in their agreement with the statement “I trust the information that I have received about the vaccines” compared to nursing students. While both groups leaned toward agreement, the broader spread among medical students suggests more diverse opinions, possibly reflecting deeper exposure to varying sources of information or a more analytical approach to evaluating it.<br><b>2. <i>Concern About Side Effects and Age:</i></b> <br>Across all age groups, responses to concerns about “safety and side effects” hovered around a neutral average (Likert score ≈ 3), with relatively wide error bars. This indicates uncertainty or ambivalence. However, younger students tended to show slightly higher trust (i.e. lower concern scores), suggesting that age may play a role in perceived vaccine risk.<br><b>3. There is perhaps some <i>counter-intuition</i> at play here among nursing students who responded to the question: "Based on my understanding, I believe this vaccine is safe."</b><br> The 10–90th percentile bars span the entire Likert scale, suggesting considerable variability in responses. While this reflects some uncertainty, it may stem from the ambiguity of what <i>understanding</i> means in this context. Perhaps students interpret <i>understanding</i> as requiring a solid grasp of virology—something not all nursing students may have studied in depth. Alternatively, they may associate it with the rapid development timeline of the vaccine, leading to concerns about whether it was produced safely. It seems unlikely, however, that true denial of vaccine safety is the dominant interpretation among nursing students.<br>Overall, the data reveal meaningful variation in how medical and nursing students interpret the science and safety of COVID-19 vaccines, highlighting the complexity of attitudes even within healthcare education.</div>```{r}#| label: label-me-2#------- no code necessary ..```## 3 - COVID survey - reconstruct<div class="question-box">Q3 ....</div><details><summary>Data Analysis - Q1</summary>```{r}#| label: label-me-3-data-table#| echo: false#| fig-asp: 0.63#| fig-width: 7#| #===========# Question 3#===========# Step 1: Read CSV without headers to inspect structure# - Define "" and "NA" as missingraw_preview <-read_csv("data/covid-survey.csv",col_names =FALSE,na =c("", "NA"),show_col_types =FALSE)# - Step 1a: print the dim of the original df.library(glue)original_dim <-dim(raw_preview)cat(glue("📄 The original data frame (raw_preview) has:\n","- {original_dim[1]} rows\n","- {original_dim[2]} columns\n"))# Step 2: View to determine the row containing real column names#View(raw_preview)# Step 3: Based on inspection, set the correct `skip` value# (assume row 2 is the real header → skip = 1)survey_raw <-read_csv("data/covid-survey.csv",skip =1,na =c("", "NA"),show_col_types =FALSE)# Step 4: Dynamically get column namescol_names <-names(survey_raw)# =====================# Diagnostic Summary# =====================# - Overall % missing values (NA)total_cells <-nrow(survey_raw) *ncol(survey_raw)missing_cells <-sum(is.na(survey_raw))missing_pct_total <-round(100* missing_cells / total_cells, 2)# - % of rows with at least one NArows_with_na <- survey_raw |>filter(if_any(everything(), is.na))pct_rows_with_na <-round(100*nrow(rows_with_na) /nrow(survey_raw), 2)row_indices_with_na <-which(apply(survey_raw, 1, function(x) any(is.na(x))))# - Rows with more than one NAna_per_row <- survey_raw |>apply(1, function(x) sum(is.na(x)))rows_with_multiple_na <-which(na_per_row >1)pct_rows_with_multiple_na <-round(100*length(rows_with_multiple_na) /nrow(survey_raw), 2)# - Create diagnostic summary tabledataset_diagnostics <-tibble(Metric =c("Total % of values missing","Percent of rows with ≥1 NA","Row indices with ≥1 NA (first 20)","Percent of rows with >1 NA","Row indices with >1 NA (first 20)" ),Value =c( missing_pct_total, pct_rows_with_na,paste(head(row_indices_with_na, 20), collapse =", "), pct_rows_with_multiple_na,paste(head(rows_with_multiple_na, 20), collapse =", ") ))# ==============================# Quarto-Ready Diagnostic Table# ==============================library(kableExtra)dataset_diagnostics |>kable(caption ="<span style='font-weight:bold; font-size:1.1em;'>Table 1. Dataset Missing Value Diagnostics</span>",escape =FALSE ) |>kable_styling(full_width =FALSE,bootstrap_options =c("striped", "hover", "condensed") ) |>row_spec(which(dataset_diagnostics$Metric =="Total % of values missing"),background ="#fff3cd"# Light highlight )# =======================================# Remove rows that are entirely NA except for response_id# =======================================# - Original dataset row countoriginal_row_count <-nrow(survey_raw)# - Detect rows where all columns *except* response_id are NAcols_except_response_id <-setdiff(names(survey_raw), "response_id")rows_to_remove <-which( survey_raw |>select(all_of(cols_except_response_id)) |>apply(1, function(x) all(is.na(x))))# - Remove the rowssurvey_clean <- survey_raw[-rows_to_remove, ]# - Number of rows removedrows_removed <-length(rows_to_remove)# - User alert with clear summarycat( glue::glue("✅ Rows with only `response_id` and all other fields missing have been removed.\n","Original dataset rows: {original_row_count}\n","Rows removed: {rows_removed}\n","Cleaned dataset size: {nrow(survey_clean)} rows × {ncol(survey_clean)} columns\n" ))# - Print row numbers removed in a 4-column layoutif (rows_removed >0) {library(knitr) formatted_rows <-paste0("row:", rows_to_remove) padded_length <-ceiling(length(formatted_rows) /4) *4 formatted_rows <-c(formatted_rows, rep("", padded_length -length(formatted_rows))) removed_matrix <-matrix(formatted_rows, ncol =4, byrow =TRUE)cat("\n\n**Rows_Removed**\n")kable(removed_matrix, col.names =NULL, align ="l")}```</details>```{r}#| label: label-me-3a# - Step 1a: print the dim of the original df.original_dim <-dim(raw_preview)cat(glue("📄 The original data frame (`raw_preview`) has:\n","- {original_dim[1]} rows\n","- {original_dim[2]} columns\n\n","⚠️ Rows with no available data (i.e., only `response_id` present)\n will be removed in preprocessing.\n","\n✅ **New Dimensions** of `survey_clean` after cleaning:\n","📊 Rows: {nrow(survey_clean)}\n","📐 Columns: {ncol(survey_clean)}\n"))#-- ... --- based on info in pdf file and .csv .. encode the followingcovid_survey_longer <- survey_clean |>pivot_longer(cols =starts_with("exp_"),names_to ="explanatory",values_to ="explanatory_value" ) |>mutate(explanatory_value =as.factor(explanatory_value)) |>filter(!is.na(explanatory_value)) |>pivot_longer(cols =starts_with("resp_"),names_to ="response",values_to ="response_value" )print(covid_survey_longer)```<div class="question-box"><b>Q3 code explanation:</b><br><div class="code-container"><code>covid_survey_longer <- covid_survey |><br> pivot_longer( <br> cols = starts_with("exp_"),<br> names_to = "explanatory",<br> values_to = "explanatory_value"<br> ) |><br> filter(!is.na(explanatory_value)) |><br> pivot_longer(<br> cols = starts_with("resp_"),<br> names_to = "response",<br> values_to = "response_value"<br> )</code></div><br> <b>first pivot_longer():</b><br> Converts all columns that start with "exp_" (e.g., exp_profession, exp_gender, etc.) from wide format into long format.<br>Creates two new columns:<br>explanatory: holds the original column names (like "exp_profession")<br>explanatory_value: holds the actual values from those columns (like "Nursing" or "1")<br> <b>second pivot_longer():</b><br>After already pivoting the explanatory variables, this takes the remaining<br> response variables (resp_safety, resp_confidence_science, etc.) and pivots them long as well.<br>Creates two new columns:<br>response: original column name<br>response_value: corresponding value<br></div><div class="question-box">create the df/tibble: covid_survey_summary_stats_by_group</div>```{r}#| label: label-me-3b# - group the data - by explanatory, explanatory_value, and response calc.# - the following stats:# - mean of the response_value# - low 10th percentile of the response_value# - high 90th percentile of the response_value# - rename the df coivd_survey_summart_stats_by_groupcovid_survey_summary_stats_by_group <- covid_survey_longer |>group_by(explanatory, explanatory_value, response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),.groups ="drop" )print(covid_survey_summary_stats_by_group)#View(covid_survey_summary_stats_by_group)```<div class="question-box">create the df/tibble: covid_survey_summary_stats_all</div>```{r}#| label: label-me-3clibrary(dplyr)covid_survey_summary_stats_all <- covid_survey_longer |>group_by(response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),explanatory ="All",explanatory_value =factor(""),.groups ="drop" )print(covid_survey_summary_stats_all)#View(covid_survey_summary_stats_all)```<div class="question-box">Bind the two df's<br>create the df/tibble: covid_summary_of_stats</div>```{r}#| label: label-me-3d# Get existing levels from grouped dataage_levels <-levels(covid_survey_summary_stats_by_group$explanatory_value)# Add a new level to represent the 'All' groupage_levels_with_all <-c(age_levels, "")# Create the all-summary with the new factor levelcovid_survey_summary_stats_all <- covid_survey_longer |>group_by(response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),explanatory ="All",explanatory_value =factor("", levels = age_levels_with_all),.groups ="drop" )# Ensure grouped summary has the same levels toocovid_survey_summary_stats_by_group$explanatory_value <-factor( covid_survey_summary_stats_by_group$explanatory_value,levels = age_levels_with_all)# Bind them safely nowcovid_survey_summary_stats <-bind_rows( covid_survey_summary_stats_all, covid_survey_summary_stats_by_group)print(covid_survey_summary_stats)write.csv(covid_survey_summary_stats, "covid_survey_summary_stats.csv", row.names =FALSE)```<div class="question-box">Q3e - recreate plot </div>```{r}#| label: label-me-3-e#| fig.width: 10 # width in inches#| fig.height: 6 # height in inches# Labels for rows (explanatory variables), including Gender and Raceexplanatory_labels <-c(All ="All",exp_age_bin ="Age",exp_gender ="Gender",exp_race ="Race" ,# Added Race labelexp_ethnicity ="Ethnicity",exp_profession ="Profession",exp_already_vax ="Had COVID\nVaccine",exp_flu_vax ="Had flu Vaccine\nthis year")# - call formatting for encoded datacovid_age_only <-filter_age_data(covid_survey_summary_stats_by_group)covid_gender_only <-filter_gender_data(covid_survey_summary_stats_by_group)covid_race_only <-filter_race_data(covid_survey_summary_stats_by_group)covid_ethnicity_only <-filter_ethnicity_data(covid_survey_summary_stats_by_group)covid_profession_only <-filter_profession_data(covid_survey_summary_stats_by_group)covid_covid_vax_only <-filter_vax_status_data(covid_survey_summary_stats_by_group)covid_flu_vax_only <-filter_flu_vax_data(covid_survey_summary_stats_by_group)# Label mappings for responseresponse_labels <-c(resp_safety ="Based on my\n understanding I\n belive the vaccine\n is safe.",resp_feel_safe_at_work ="Getting the vaccine\nwill make me feel\nsafer at work.",resp_concern_safety ="I am concerned\nabout the safety\nand side effects of\nthe vaccine.",resp_confidence_science ="I am confident in\n the scientific\n vetting process for\n the new COVID\vaccines.",resp_trust_info ="I trust the\ninformation that I\nhave received about\nthe vaccines",resp_will_recommend ="I will recommend\nthe vaccine to\nfamily, friends,\nand community\nmembers.")# Reorder response factor levels to match response_labelscovid_age_only <- covid_age_only %>%mutate(response =factor(response, levels =names(response_labels)))covid_gender_only <- covid_gender_only %>%mutate(response =factor(response, levels =names(response_labels)))# View distinct codes used in the exp_ethnicity variable# covid_survey_summary_stats_by_group %>%# filter(explanatory == "exp_ethnicity") %>%# mutate(explanatory_value = as.character(explanatory_value)) %>%# distinct(explanatory_value) %>%# arrange(explanatory_value)# - medical profession#filter_profession_data# Vector controlling heights of each row - add height for racerow_heights <-c(0.5, # - 'All' row height — adjust as needed3, # - 'exp_age_bin' row height3, # - 'exp_gender' row height - adjust as desired3, # - 'exp_race' row height - new Race row3, # - ethnicity2, # - profession2, # - Covid Vax2# -Flu Vax)# Reorder response factor levels for 'All' layercovid_all_only <- covid_survey_summary_stats_all %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(response =factor(response, levels =names(response_labels)))# - vars for standardizing box size row/col# Define variables for strip appearancestrip_fill_color <-"gray90"strip_text_color <-"black"strip_text_size <-9strip_text_face <-"plain"strip_text_angle_x <-0strip_text_angle_y <-0strip_text_vjust_y <-0.5strip_placement <-"outside"# already used in your code# Call some functionsg0 <-plot_all(covid_all_only)# - second layer - Ageg1 <-plot_age(covid_age_only)# - third layer - genderg2 <-plot_gender(covid_gender_only)# Fourth layer - Raceg3 <-plot_race(covid_race_only)# Fifth layer: Ethnicity (if present)g4 <-plot_ethnicity(covid_ethnicity_only,"")#g5 <-plot_profession(covid_profession_only,"")#g6 <-plot_vax_status(covid_covid_vax_only,"")#g7 <-plot_flu_vax_status(covid_flu_vax_only,"Error bars in range from 10th to 90th percentile")# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)composite_plot <- (g0 / g1 / g2 / g3 / g4 / g5 /g6 /g7 +plot_layout(heights = row_heights)) &theme(plot.margin =margin(0, 0, 0, 0))print(composite_plot)```## 4 - COVID survey - re-reconstruct<div class="question-box">Q4 ....Make Plot from Q3, but use different end point quarantiles.<br>When the error bars represent the 25th and 75th percentiles instead of the 10th and 90th, the intervals become narrower, reflecting a tighter range around the median of the data. This change reduces the apparent variability and uncertainty in responses. Compared to the previous plot, the shorter error bars may make the group differences appear more precise but potentially understate the true variability. Therefore, while the overall trends remain similar, conclusions about the degree of uncertainty should be adjusted to recognize that the interquartile range excludes more extreme values.<br></div>```{r,echo=FALSE}#| label: label-me-4#| fig.width: 10 # width in inches#| fig.height: 6 # height in inches# - set quartile variablesleft_quartile <- 0.25right_quartile <- 0.75# ... repeat steps necessary to get the quartile data.covid_survey_summary_stats_by_group <- covid_survey_longer |> group_by(explanatory, explanatory_value, response) |> summarise( mean = mean(response_value, na.rm = TRUE), low = quantile(response_value, probs = left_quartile, na.rm = TRUE), high = quantile(response_value, probs = right_quartile, na.rm = TRUE), .groups = "drop" )covid_survey_summary_stats_all <- covid_survey_longer |> group_by(response) |> summarise( mean = mean(response_value, na.rm = TRUE), low = quantile(response_value, probs = left_quartile, na.rm = TRUE), high = quantile(response_value, probs = right_quartile, na.rm = TRUE), explanatory = "All", explanatory_value = factor(""), .groups = "drop" )# Create the all-summary with the new factor levelcovid_survey_summary_stats_all <- covid_survey_longer |> group_by(response) |> summarise( mean = mean(response_value, na.rm = TRUE), low = quantile(response_value, probs = left_quartile, na.rm = TRUE), high = quantile(response_value, probs = right_quartile, na.rm = TRUE), explanatory = "All", explanatory_value = factor("", levels = age_levels_with_all), .groups = "drop" )# Ensure grouped summary has the same levels toocovid_survey_summary_stats_by_group$explanatory_value <- factor( covid_survey_summary_stats_by_group$explanatory_value, levels = age_levels_with_all)# Bind them safely nowcovid_survey_summary_stats <- bind_rows( covid_survey_summary_stats_all, covid_survey_summary_stats_by_group)# .......print(covid_survey_summary_stats)# - call encoding formatting...covid_age_only <- filter_age_data(covid_survey_summary_stats_by_group)covid_gender_only <- filter_gender_data(covid_survey_summary_stats_by_group)covid_race_only <- filter_race_data(covid_survey_summary_stats_by_group)covid_ethnicity_only <- filter_ethnicity_data(covid_survey_summary_stats_by_group)covid_profession_only <- filter_profession_data(covid_survey_summary_stats_by_group)covid_covid_vax_only <-filter_vax_status_data(covid_survey_summary_stats_by_group)covid_flu_vax_only <- filter_flu_vax_data(covid_survey_summary_stats_by_group)# - call graph maker..# Call some functionsg0 <- plot_all(covid_all_only)# - second layer - Ageg1 <- plot_age(covid_age_only)# - third layer - genderg2 <- plot_gender(covid_gender_only)# Fourth layer - Raceg3 <- plot_race(covid_race_only)# Fifth layer: Ethnicity (if present)g4 <- plot_ethnicity(covid_ethnicity_only,"")#g5 <- plot_profession(covid_profession_only,"")#g6 <- plot_vax_status(covid_covid_vax_only,"")#g7 <- plot_flu_vax_status(covid_flu_vax_only,"Error bars in range from 10th to 90th")# Composite plot with 7 layers stacked (All / Age / Gender / Race / Ethnicity)# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)composite_plot_2 <- (g0 / g1 / g2 / g3 / g4 / g5 /g6 /g7 + plot_layout(heights = row_heights)) & theme(plot.margin = margin(0, 0, 0, 0))print(composite_plot_2)```## 5 - COVID survey - another view<div class="question-box"><b>Q5a ....</b><br>COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.<br><br><b>a. Create a diverging bar chart. Write alt text for your visualization as well.</b><br><b>Write alt text for your visualization as well.</b></div>```{r}#| label: label-me-5a#| fig.width: 10 # width in inches#| fig.height: 6 # height in inches#| warning: false# - Using Likert Data for 6 survey questions, create a diverging bar chart.# Response labels (questions)response_labels <-c(resp_safety ="Vaccine is safe",resp_feel_safe_at_work ="Feel safer at work",resp_concern_safety ="Concern re : vaccine side effects",resp_confidence_science ="Confidence in scientific vetting",resp_trust_info ="Trust in vaccine info",resp_will_recommend ="Will recommend vaccine")# Likert response mappinglikert_scores <-c("1"="Strongly Agree","2"="Somewhat Agree","3"="Neither Agree Nor Disagree","4"="Somewhat Disagree","5"="Strongly Disagree")# Your custom viridis palette (turquoise to yellow-green)my_turquoise_to_yellow <-viridis(12,begin =0.25,end =0.85,option ="viridis")[c(1, 3, 5, 7, 9, 11)]# 1. Clean and assign numeric scores for mean computationlikert_means <- covid_survey_longer %>%filter( response %in%names(response_labels),!is.na(response_value), response_value !="" ) %>%distinct(response_id, response, .keep_all =TRUE) %>%mutate(response_label =factor(response_labels[response], levels = response_labels),response_numeric =as.numeric(response_value) ) %>%group_by(response_label) %>%summarize(mean_score =mean(response_numeric, na.rm =TRUE),sd_score =sd(response_numeric, na.rm =TRUE),n =n() ) %>%ungroup()# 2. Calculate overall mean for centeringoverall_mean <-mean(likert_means$mean_score, na.rm =TRUE)# 3. Assign color scale (optional: color by mean score direction)likert_means <- likert_means %>%mutate(centered_score = mean_score - overall_mean,direction =ifelse(centered_score <0, "positive", "negative") )# Add magma color mapping for Likert scoreslikert_colors <- viridis::viridis(5, option ="magma")# Prepare Likert labels with colorlikert_xlabels <-data.frame(likert_score =1:5,centered_score =1:5- overall_mean,color = likert_colors,label =as.character(1:5))# Pick a magma tone (e.g., middle of the scale)magma_color <- viridis::viridis(1, option ="magma")g_centered <-ggplot(likert_means, aes(x = centered_score, y =fct_rev(response_label), fill = direction)) +geom_col(width =0.6) +annotate("segment",x =0, xend =0,y =0.5, yend =6.5, # Adjust y range to control vertical spanlinetype ="dashed",color ="black",linewidth =0.8 ) +scale_x_continuous(name ="Deviation from Overall Mean Likert Score",limits =c(-1.5, 1.5),breaks =seq(-1.5, 1.5, by =0.5),sec.axis =dup_axis(name =NULL, labels =NULL) ) +scale_y_discrete(expand =expansion(mult =c(0.05, 0.32)) ) +scale_fill_manual(values =c(positive = my_turquoise_to_yellow[2], negative = my_turquoise_to_yellow[5]),name ="Likert Score Relation",labels =c(positive ="Below Mean", negative ="Above Mean") ) +labs(title ="Mean-Centered Likert Scores by Question\n",y =NULL ) +annotate("segment",x =0.45, xend =0,y =1.3, yend =1.6,color ="black",arrow =arrow(length =unit(0.2, "cm")),linewidth =0.6 ) +annotate("label",x =0.5, y =1.1,label =sprintf("Mean = %.2f", overall_mean),hjust =0,vjust =0,size =4.5,fill ="white",color ="black",label.size =0.4,label.r =unit(0.15, "lines"),label.padding =unit(0.3, "lines") ) +# Likert numbers colored with magma palettegeom_text(data = likert_xlabels,aes(x = centered_score, y =6.8, label = likert_score),inherit.aes =FALSE,size =7,color = likert_colors ) +# Likert Scores label colored in magma (middle color)annotate("text",x =0, y =7.4,label ="Likert Scores",size =7,fontface ="italic",color = likert_colors[3] ) +theme_minimal() +theme(plot.title =element_text(size =16, face ="bold", hjust =0.5),axis.text.y =element_text(size =15, lineheight =2),axis.text.x =element_text(size =14),axis.title.x.top =element_text(size =10, color ="gray30", face ="italic", hjust =1),legend.position ="bottom",legend.title =element_text(size =12, face ="bold"),legend.text =element_text(size =11),plot.margin =margin(t =10, r =10, b =10, l =10) )plot(g_centered)```<div class="note-box "> <b>Q5a - Alt text</b><br>A horizontal diverging bar chart presents mean-centered Likert scores for six COVID vaccine survey questions: “Vaccine is safe,” “Feel safer at work,” “Concern re : vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.” Each bar extends from a vertical midpoint at the mean response (1.79) to show how average responses deviate above or below the overall mean, with greater lengths indicating stronger deviation. <br>Turquoise segments indicate questions with more agreement (e.g., stronger confidence or willingness to recommend), while yellow-green segments reflect comparatively more skepticism. The chart includes an annotated mean marker, and Likert scale values from “1 - Strongly Agree” to “5 - Strongly Disagree” appear along the top axis in magma tones to reinforce the interpretive range. <br>All questions except those concerning vaccine safety and side effects tend towards '1 - Strongly Agree.' The scores for 'Vaccine is safe' and 'concerns about vaccine side effects', while above the mean, did not score into Likert-4 or Likert-5, indicating uncertainty or abmivalence but not concern.</div><div class="question-box"><b>Q5b ....</b><br>COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.<br><b>b. Create a 100% bar chart</b><br><b>Write alt text for your visualization as well.</b></div>```{r}#| label: label-me-5b#| fig.width: 10 # width in inches#| fig.height: 6 # height in inches#| warning: false# Full Likert labels - for the legendlikert_labels_full <-c("1"="1 - Strongly Agree","2"="2 - Somewhat Agree","3"="3 - Neither Agree Nor Disagree","4"="4 - Somewhat Disagree","5"="5 - Strongly Disagree")# Filter and prepare proportionslikert_props <- covid_survey_longer %>%filter(response %in%names(response_labels), !is.na(response_value), response_value !="") %>%mutate(response_label =factor(response_labels[response], levels = response_labels),likert_level =factor(likert_labels_full[response_value], levels = likert_labels_full) ) %>%group_by(response_label, likert_level) %>%summarize(n =n(), .groups ="drop") %>%group_by(response_label) %>%mutate(percentage = n /sum(n) *100)# Build 100% stacked bar chart with full Likert labelsg5b <-ggplot(likert_props, aes(x = percentage, y =fct_rev(response_label), fill = likert_level)) +geom_col(width =0.7) +scale_fill_viridis_d(option ="viridis",begin =0.2,end =0.85,direction =-1,name ="Response" ) +labs(title ="Distribution of Likert Responses by Question",x ="Percentage",y =NULL ) +theme_minimal() +theme(plot.title =element_text(size =18, face ="bold", hjust =0.5),legend.position ="right",legend.title =element_text(size =12, face ="bold"),legend.text =element_text(size =11),axis.text.y =element_text(size =14),axis.text.x =element_text(size =14) )plot(g5b)```<div class="note-box "> <b>Q5b - Alt text</b><br>A horizontal stacked bar chart displays the distribution of Likert-scale responses to six survey questions on COVID vaccine attitudes, including: “Vaccine is safe,” “Feel safer at work,” “Concern re: vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.” <br>Each bar represents a question, divided into colored segments for each response level from “1 - Strongly Agree” to “5 - Strongly Disagree,” using a perceptually uniform Viridis palette. Most responses cluster on the agreement end of the scale, especially for trust and confidence statements, while the item “Concern about vaccine safety” shows a more neutral distribution centered around “Somewhat Agree” and “Neither Agree Nor Disagree.” <br>The visual design emphasizes sentiment distribution and is optimized for accessibility, including colorblind-friendly contrast and consistent labeling.</div>